This is an interactive visualization representing the frequency of delays by subway line.
Packages (need to install gt package to use gt library first)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(viridis)
## Loading required package: viridisLite
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(knitr)
library(gt)
Load data.
raw <- read_csv("data/2019_subway_rider_data.csv") |>
clean_names()
## Rows: 10704 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): subway_line_used_most_often, use_of_subway_frequency, get_to_subwa...
## lgl (1): is_subway_affordable
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Clean dataset.
# split multi-line answers (e.g., "A/C" → "A", "C")
cleaned <- raw |>
mutate(
subway_line = subway_line_used_most_often |>
str_replace_all("[^A-Za-z0-9]", "/") |> # normalize separators
str_split("/") # split into multiple lines
) |>
unnest(subway_line) |>
mutate(
subway_line = str_trim(subway_line),
subway_line = str_to_upper(subway_line)
) |>
filter(subway_line %in% c(
"1","2","3","4","5","6","7",
"A","B","C","D","E","F","M",
"G","J","Z","N","Q","R","W","L","S"
))
# Convert column naming to an ordered factor for plotting:
cleaned <- cleaned |>
mutate(
frequency_of_delays = factor(
frequency_of_delays,
levels = c(
"Never",
"Rarely",
"Once a month",
"A few times a month",
"A few times a week",
"Everyday"
),
ordered = TRUE
)
)
#convert to a numeric midpoint in minutes (for average delay per line):
cleaned <- cleaned |>
mutate(
delay_min = case_when(
approximate_delay_duration == "< 10 min" ~ 5,
approximate_delay_duration == "10 - 20 min" ~ 15,
approximate_delay_duration == "20 - 40 min" ~ 30,
approximate_delay_duration == "40 - 60 min" ~ 50,
approximate_delay_duration == "Over 60 min" ~ 70,
approximate_delay_duration == "Over 40 min" ~ 50,
TRUE ~ NA_real_
)
)
Display table of Subway Line Directory:
subway_directory_display <- tribble(
~`Line Family`, ~Lines, ~Color,
"Red Line", "1, 2, 3", "#EE352E",
"Green Line", "4, 5, 6", "#00933C",
"Purple Line", "7", "#B933AD",
"Blue Line", "A, C, E", "#0039A6",
"Orange Line", "B, D, F, M", "#FF6319",
"Yellow Line", "N, Q, R, W", "#FCCC0A",
"Light Green Line", "G", "#6CBE45",
"Brown Line", "J, Z", "#996633",
"Gray Line", "L", "#A7A9AC",
"Shuttle", "S", "#808183"
)
subway_directory_display |>
gt() |>
tab_header(
title = "NYC Subway Line Directory"
) |>
text_transform(
locations = cells_body(columns = Color),
fn = function(color) {
paste0(
"<div style='background-color:", color,
"; width: 60px; height: 20px; border-radius: 4px'></div>"
)
}
) |>
cols_width(
everything() ~ px(180)
) |>
opt_table_outline()
| NYC Subway Line Directory | ||
| Line Family | Lines | Color |
|---|---|---|
| Red Line | 1, 2, 3 | |
| Green Line | 4, 5, 6 | |
| Purple Line | 7 | |
| Blue Line | A, C, E | |
| Orange Line | B, D, F, M | |
| Yellow Line | N, Q, R, W | |
| Light Green Line | G | |
| Brown Line | J, Z | |
| Gray Line | L | |
| Shuttle | S | |
Join-ready Subway Line Directory for analysis:
subway_directory <- tribble(
~subway_line, ~group_color, ~hex_color,
"1","Red Line","#EE352E",
"2","Red Line","#EE352E",
"3","Red Line","#EE352E",
"4","Green Line","#00933C",
"5","Green Line","#00933C",
"6","Green Line","#00933C",
"7","Purple Line","#B933AD",
"A","Blue Line","#0039A6",
"C","Blue Line","#0039A6",
"E","Blue Line","#0039A6",
"B","Orange Line","#FF6319",
"D","Orange Line","#FF6319",
"F","Orange Line","#FF6319",
"M","Orange Line","#FF6319",
"N","Yellow Line","#FCCC0A",
"Q","Yellow Line","#FCCC0A",
"R","Yellow Line","#FCCC0A",
"W","Yellow Line","#FCCC0A",
"G","Light Green Line","#6CBE45",
"J","Brown Line","#996633",
"Z","Brown Line","#996633",
"L","Gray Line","#A7A9AC",
"S","Shuttle","#808183"
)
cleaned <- cleaned |>
left_join(subway_directory, by = "subway_line")
Frequency of delays across all trains. Interactive Plotly Bar Chart
freq_by_line <- cleaned |>
drop_na(frequency_of_delays) |>
count(subway_line, frequency_of_delays) |>
group_by(subway_line) |>
summarize(delay_score = sum(as.numeric(frequency_of_delays))) |>
left_join(subway_directory, by = "subway_line")
# Group bars together by color family matching the directory order
freq_by_line <- cleaned |>
drop_na(frequency_of_delays) |>
count(subway_line, frequency_of_delays) |>
group_by(subway_line) |>
summarize(delay_score = sum(as.numeric(frequency_of_delays))) |>
left_join(subway_directory, by = "subway_line") |>
arrange(group_color, subway_line) |>
mutate(
subway_line = factor(subway_line, levels = subway_line)
)
plot_ly(
data = freq_by_line,
x = ~subway_line,
y = ~delay_score,
type = "bar",
marker = list(color = ~hex_color),
# Send group_color to the hovertemplate
customdata = ~group_color,
# Hover box formatting
hovertemplate = paste(
"<b>Subway Line:</b> %{x}<br>",
"<b>Delay Frequency Score:</b> %{y}<br>",
"<extra></extra>" # removes default tooltip
)
) |>
layout(
title = "Frequency of Reported Delays by Subway Line",
xaxis = list(title = "Subway Line"),
yaxis = list(title = "Delay Frequency Score"),
showlegend = FALSE
)
Frequency of Delays Across Line Family
freq_by_group <- cleaned |>
drop_na(frequency_of_delays) |>
count(group_color, frequency_of_delays) |>
group_by(group_color) |>
summarize(delay_score = sum(as.numeric(frequency_of_delays))) |>
left_join(
subway_directory |> distinct(group_color, hex_color),
by = "group_color"
)
plot_ly(
data = freq_by_group,
x = ~group_color,
y = ~delay_score,
type = "bar",
marker = list(color = ~hex_color),
# add hovertemplate
hovertemplate = paste(
"<b>Color Group:</b> %{x}<br>",
"<b>Delay Frequency Score:</b> %{y}<br>",
"<extra></extra>"
)
) |>
layout(
title = "Frequency of Reported Delays by Subway Color Group",
xaxis = list(title = "Color Group"),
yaxis = list(title = "Delay Frequency Score"),
showlegend = FALSE
)
Proportion of Riders Reporting Long Delays (≥20 min) by subway line:
# Define long-delay categories
long_categories <- c(
"20 - 40 min", "20 - 45 min",
"40 - 60 min", "45 - 60 min",
"> 60 min"
)
# Create an ordering by line family
line_order <- subway_directory |>
arrange(group_color, subway_line) |>
pull(subway_line)
# Build long delay dataset
long_delay_by_line <- cleaned |>
filter(!is.na(approximate_delay_duration)) |>
mutate(
long_delay = approximate_delay_duration %in% long_categories
) |>
count(subway_line, long_delay) |>
group_by(subway_line) |>
mutate(prop = n / sum(n)) |>
filter(long_delay == TRUE) |>
select(subway_line, prop) |>
left_join(subway_directory, by = "subway_line") |>
mutate(
subway_line = factor(subway_line, levels = line_order),
group_color = factor(group_color, levels = unique(subway_directory$group_color))
)
# Now plot
plot_ly(
data = long_delay_by_line,
x = ~subway_line,
y = ~prop,
type = "bar",
marker = list(color = ~hex_color),
customdata = ~group_color,
hovertemplate = paste(
"<b>Subway Line:</b> %{x}<br>",
"<b>Proportion Long Delays (≥20 min):</b> %{y:.2f}<br>",
"<b>Color Group:</b> %{customdata}",
"<extra></extra>"
)
) |>
layout(
title = list(
text = "<b>Proportion of Riders Reporting Long Delays (≥ 20 min) by Subway Line</b>",
y = 0.95
),
xaxis = list(
title = "Subway Line",
tickangle = 0,
categoryorder = "array",
categoryarray = line_order
),
yaxis = list(
title = "Proportion of Long Delays",
tickformat = ".0%",
range = c(0, max(long_delay_by_line$prop) * 1.1)
),
bargap = 0.20, # spacing between groups
bargroupgap = 0.05 # spacing within groups
)
## Warning: 'layout' objects don't have these attributes: 'bargroupgap'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'boxmode', 'barmode', 'bargap', 'mapType'
Heat map of ordinal frequency of delay responses
# keep only the meaningful delay levels
keep_delays <- c(
"Rarely",
"A few times a week",
"Everyday"
)
heat_data <- cleaned |>
filter(!is.na(frequency_of_delays)) |>
filter(frequency_of_delays %in% keep_delays) |>
count(subway_line, frequency_of_delays) |>
tidyr::complete(
subway_line = unique(subway_directory$subway_line),
frequency_of_delays = keep_delays,
fill = list(n = 0)
) |>
group_by(subway_line) |>
mutate(prop = n / sum(n)) |>
ungroup() |>
left_join(subway_directory, by = "subway_line") |>
arrange(group_color, subway_line) |>
mutate(
subway_line = factor(subway_line, levels = unique(subway_line)),
frequency_of_delays = factor(frequency_of_delays, levels = keep_delays)
)
# Heatmap (now only 3 categories)
plot_ly(
data = heat_data,
x = ~frequency_of_delays,
y = ~subway_line,
z = ~prop,
type = "heatmap",
colors = viridisLite::viridis(100),
customdata = ~n,
hovertemplate = paste(
"<b>Line:</b> %{y}<br>",
"<b>Delay Frequency:</b> %{x}<br>",
"<b>Proportion:</b> %{z:.1%}<br>",
"<b>Count:</b> %{customdata}<br>",
"<extra></extra>"
)
) |>
layout(
title = "Delay Frequency Distribution by Subway Line",
xaxis = list(
title = "Delay Frequency",
tickangle = 0,
categoryorder = "array",
categoryarray = keep_delays
),
yaxis = list(
title = "Subway Line",
categoryorder = "array",
categoryarray = unique(heat_data$subway_line)
),
margin = list(l = 90, r = 20, b = 80, t = 80),
paper_bgcolor = "white",
plot_bgcolor = "white"
)